# 20180609
# author: 葛木瓜
# Scrapes the housing-price packages from the Xi'an Price Bureau official website.

from urllib import request
from selenium import webdriver
import zipfile
import os

# Page on the Price Bureau site that the shared browser session opens first.
url = 'http://wjj.xa.gov.cn/ptl/def/def/index_1285_3887_ci_trid_4416419.html'
# Alternative browser backend, kept for reference:
# driver = webdriver.Firefox()
# NOTE: the PhantomJS executable path is hard-coded to a local Windows install.
driver = webdriver.PhantomJS(executable_path='D:\\Python36\\phantomjs.exe')
# The page is loaded as soon as this module is executed; the functions below
# all reuse this single module-level driver.
driver.get(url)
# Let element lookups wait up to 5 seconds before giving up.
driver.implicitly_wait(5)
# Directory containing this script; data files are addressed relative to it.
cur_path = os.path.dirname(__file__)


def get_url():

    """
    Append newly listed article URLs to ``house_prices/url_list.txt``.

    Switches the shared ``driver`` into the ``iframecenter`` frame and reads
    the ``href`` of the elements whose id is ``linkId``.

    * When the list file holds no URLs yet, one link per table row found on
      the page is written, in reverse page order, and ``1`` is returned.
    * Otherwise the number after the first ``=`` of the last stored URL is
      compared with that of (at most) the first five links on the page, and
      every link with a larger number is appended.

    :return: ``0`` when nothing was appended to a non-empty list, ``1`` when
             the list was empty and has been filled, otherwise the number of
             URLs appended (which may itself be ``1``).
    """
    driver.switch_to.frame('iframecenter')
    date_list = driver.find_elements_by_xpath('.//*[@id="tablelist"]/tbody/tr/td[3]/span')
    # Query the links once instead of on every loop iteration.
    links = driver.find_elements_by_id('linkId')
    list_path = cur_path + "/house_prices/url_list.txt"
    download_flag = 0

    # 'a+' creates the file when it is missing, lets us read what is already
    # stored, and always appends on write; the context manager guarantees the
    # handle is closed even if a lookup below raises.
    with open(list_path, 'a+') as url_file:
        url_file.seek(0)
        # Ignore blank lines so a stray trailing newline cannot break parsing.
        known_urls = [line.strip() for line in url_file.read().splitlines() if line.strip()]

        if not known_urls:
            # First run: store every row, reversed relative to the page order.
            for link in reversed(links[:len(date_list)]):
                url_file.write(link.get_attribute('href') + '\n')
            download_flag = 1
        else:
            f_latest_num = int(known_urls[-1].split('=')[1])
            # Only the first five entries are checked; slicing tolerates
            # pages that list fewer than five links.
            for link in reversed(links[:5]):
                latest_url = link.get_attribute('href')
                latest_url_num = int(latest_url.split('=')[1])
                if f_latest_num < latest_url_num:
                    url_file.write(latest_url + '\n')
                    download_flag += 1

    return download_flag


def _read_url_lines(list_path):
    """Return the stripped, non-blank lines of the URL list ([] if the file is missing)."""
    try:
        with open(list_path, 'r') as url_file:
            return [line.strip() for line in url_file if line.strip()]
    except FileNotFoundError:
        return []


def _fetch_and_unpack(page_url):
    """Open one article page, download the zip it links to and extract it."""
    driver.get(page_url)
    driver.implicitly_wait(15)
    driver.switch_to.frame('showconent1')
    link = driver.find_element_by_partial_link_text('商品住房价格')
    download_url = link.get_attribute('href')
    # Use the last URL segment as the file name rather than a fixed depth.
    zipname = cur_path + '/house_prices/' + download_url.rsplit('/', 1)[-1]
    # splitext only strips the final extension, so dots elsewhere in the
    # path (e.g. in cur_path) do not truncate the target directory name.
    filename = os.path.splitext(zipname)[0]
    request.urlretrieve(download_url, zipname)
    # Extract, then delete the archive; a bad archive is kept for inspection.
    try:
        with zipfile.ZipFile(zipname) as zfile:
            zfile.extractall(path=filename)
    except zipfile.BadZipFile:
        print(filename + " is a bad zip file ,please check!")
    else:
        if os.path.exists(zipname):
            os.remove(zipname)


def download_zip():

    """
    Read article URLs from the list file and download/extract their zips.

    Calls ``get_url()`` to refresh ``house_prices/url_list.txt``. If the list
    was empty beforehand, every stored URL is processed in file order;
    otherwise only the URLs just appended are processed, newest first.
    When anything was due for download, ``openFolder.bat`` is run afterwards.

    :return: None
    """
    list_path = cur_path + "/house_prices/url_list.txt"
    # get_url() returns 1 both for "list was empty" and for "exactly one new
    # URL", so remember the state of the list before it is modified.
    was_empty = not _read_url_lines(list_path)

    flag = get_url()
    if flag == 0:
        return

    all_lines = _read_url_lines(list_path)
    if was_empty:
        targets = all_lines
    else:
        # Last `flag` entries, most recently appended first.
        targets = all_lines[-flag:][::-1]

    for page_url in targets:
        _fetch_and_unpack(page_url)

    # New data was fetched: open the output folder.
    os.system(cur_path + "/openFolder.bat")


if __name__ == '__main__':

    # Always shut the browser down, even when the scrape fails, so that no
    # orphaned PhantomJS process is left running.
    try:
        download_zip()
    finally:
        driver.quit()



